파일 읽어들이기

The Center for World University Rankings (CWUR) 에서 제공하는 전세계 대학 순위 정보. (https://cwur.org/) 2012년 ~2018년 대학별 순위, 국내순위, 교육의 질, 졸업생 취업, 단과대학의 질, 출판물, 영향력, 인용, 특허, 총 점수 등의 정보를 제공.

CWUR uses seven objective and robust indicators to rank the world’s top 1000 universities:

  1. Quality of Education, measured by the number of a university’s alumni who have won major international awards, prizes, and medals relative to the university’s size (15%)
  2. Alumni Employment, measured by the number of a university’s alumni who have held CEO positions at the world’s top companies relative to the university’s size (15%)
  3. Quality of Faculty, measured by the number of academics who have won major international awards, prizes, and medals (15%)
  4. Research Output, measured by the total number of research papers (15%)
  5. Quality Publications, measured by the number of research papers appearing in top-tier journals (15%)
  6. Influence, measured by the number of research papers appearing in highly-influential journals (15%)
  7. Citations, measured by the number of highly-cited research papers (10%)
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.5.1
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
# Load the CWUR ranking data; keep text columns as character vectors
# instead of converting them to factors.
data <- read.csv("data/cwur_data.csv", stringsAsFactors = FALSE)
# Show the type and the first few values of every column.
str(data)
## 'data.frame':    5200 obs. of  12 variables:
##  $ world_rank          : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ institution         : chr  "Harvard University" "Massachusetts Institute of Technology" "Stanford University" "University of Cambridge" ...
##  $ country             : chr  "USA" "USA" "USA" "United Kingdom" ...
##  $ national_rank       : int  1 2 3 1 4 5 2 6 7 8 ...
##  $ quality_of_education: int  7 9 17 10 2 8 13 14 23 16 ...
##  $ alumni_employment   : int  9 17 11 24 29 14 28 31 21 52 ...
##  $ quality_of_faculty  : int  1 3 5 4 7 2 9 12 10 6 ...
##  $ publications        : int  1 12 4 16 37 53 15 14 13 6 ...
##  $ influence           : int  1 4 2 16 22 33 13 6 12 5 ...
##  $ citations           : int  1 4 2 11 22 26 19 15 14 3 ...
##  $ score               : num  100 91.7 89.5 86.2 85.2 ...
##  $ year                : int  2012 2012 2012 2012 2012 2012 2012 2012 2012 2012 ...
# Count the missing values in each column. vapply() pins the result to one
# integer per column instead of letting sapply() guess the return type.
vapply(data, function(column) sum(is.na(column)), integer(1))
##           world_rank          institution              country 
##                    0                    0                    0 
##        national_rank quality_of_education    alumni_employment 
##                    0                  597                    0 
##   quality_of_faculty         publications            influence 
##                  731                    0                    0 
##            citations                score                 year 
##                    0                    0                    0
#Hmisc::describe(data)

Visualize correlation matrix using correlogram

Data for correlation analysis

  • 상관분석을 위해서는 연속형(수치형)변수로 구성된 매트릭스이어야하고, 변수내에 결측치가 없어야 함.
# Drop the two categorical columns by name; selecting them by position
# would silently pick the wrong columns if the CSV column order changed.
data1 <- data[, !(names(data) %in% c("institution", "country"))]
# Drop the 2018 rows, which are the ones that contain missing values
data1 <- data1[data1$year != 2018, ]
# Check how many missing values remain
sum(is.na(data1))
## [1] 0
str(data1)
## 'data.frame':    4200 obs. of  10 variables:
##  $ world_rank          : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ national_rank       : int  1 2 3 1 4 5 2 6 7 8 ...
##  $ quality_of_education: int  7 9 17 10 2 8 13 14 23 16 ...
##  $ alumni_employment   : int  9 17 11 24 29 14 28 31 21 52 ...
##  $ quality_of_faculty  : int  1 3 5 4 7 2 9 12 10 6 ...
##  $ publications        : int  1 12 4 16 37 53 15 14 13 6 ...
##  $ influence           : int  1 4 2 16 22 33 13 6 12 5 ...
##  $ citations           : int  1 4 2 11 22 26 19 15 14 3 ...
##  $ score               : num  100 91.7 89.5 86.2 85.2 ...
##  $ year                : int  2012 2012 2012 2012 2012 2012 2012 2012 2012 2012 ...
# The same preparation expressed with dplyr; the categorical columns are
# dropped by name rather than by position.
data1 <- data %>%
    select(-institution, -country) %>%
    filter(year != 2018)

# Print the correlation matrix
cor(data1)
##                      world_rank national_rank quality_of_education
## world_rank            1.0000000    0.23832107            0.6343275
## national_rank         0.2383211    1.00000000            0.1853993
## quality_of_education  0.6343275    0.18539934            1.0000000
## alumni_employment     0.6289032    0.10561614            0.5572212
## quality_of_faculty    0.6249865    0.19657009            0.7491310
## publications          0.9170506    0.32654389            0.5762741
## influence             0.8928859    0.16228501            0.6074201
## citations             0.8444044    0.19559639            0.5826268
## score                -0.5491533   -0.19105680           -0.5967928
## year                  0.1620879    0.04881068            0.2587159
##                      alumni_employment quality_of_faculty publications
## world_rank                   0.6289032          0.6249865    0.9170506
## national_rank                0.1056161          0.1965701    0.3265439
## quality_of_education         0.5572212          0.7491310    0.5762741
## alumni_employment            1.0000000          0.5005800    0.5318997
## quality_of_faculty           0.5005800          1.0000000    0.5907925
## publications                 0.5318997          0.5907925    1.0000000
## influence                    0.4780830          0.6224009    0.8626805
## citations                    0.5053959          0.6121459    0.8105269
## score                       -0.4941307         -0.6955724   -0.5199695
## year                         0.2744686          0.3609531    0.1606745
##                       influence  citations      score        year
## world_rank            0.8928859  0.8444044 -0.5491533  0.16208788
## national_rank         0.1622850  0.1955964 -0.1910568  0.04881068
## quality_of_education  0.6074201  0.5826268 -0.5967928  0.25871591
## alumni_employment     0.4780830  0.5053959 -0.4941307  0.27446860
## quality_of_faculty    0.6224009  0.6121459 -0.6955724  0.36095308
## publications          0.8626805  0.8105269 -0.5199695  0.16067453
## influence             1.0000000  0.8280916 -0.5223281  0.16077842
## citations             0.8280916  1.0000000 -0.5189880  0.17755460
## score                -0.5223281 -0.5189880  1.0000000 -0.20910887
## year                  0.1607784  0.1775546 -0.2091089  1.00000000
# Print the correlation coefficients rounded to two decimal places
data1 %>%
    cor() %>%
    round(digits = 2)
##                      world_rank national_rank quality_of_education
## world_rank                 1.00          0.24                 0.63
## national_rank              0.24          1.00                 0.19
## quality_of_education       0.63          0.19                 1.00
## alumni_employment          0.63          0.11                 0.56
## quality_of_faculty         0.62          0.20                 0.75
## publications               0.92          0.33                 0.58
## influence                  0.89          0.16                 0.61
## citations                  0.84          0.20                 0.58
## score                     -0.55         -0.19                -0.60
## year                       0.16          0.05                 0.26
##                      alumni_employment quality_of_faculty publications
## world_rank                        0.63               0.62         0.92
## national_rank                     0.11               0.20         0.33
## quality_of_education              0.56               0.75         0.58
## alumni_employment                 1.00               0.50         0.53
## quality_of_faculty                0.50               1.00         0.59
## publications                      0.53               0.59         1.00
## influence                         0.48               0.62         0.86
## citations                         0.51               0.61         0.81
## score                            -0.49              -0.70        -0.52
## year                              0.27               0.36         0.16
##                      influence citations score  year
## world_rank                0.89      0.84 -0.55  0.16
## national_rank             0.16      0.20 -0.19  0.05
## quality_of_education      0.61      0.58 -0.60  0.26
## alumni_employment         0.48      0.51 -0.49  0.27
## quality_of_faculty        0.62      0.61 -0.70  0.36
## publications              0.86      0.81 -0.52  0.16
## influence                 1.00      0.83 -0.52  0.16
## citations                 0.83      1.00 -0.52  0.18
## score                    -0.52     -0.52  1.00 -0.21
## year                      0.16      0.18 -0.21  1.00

Correlogram : Visualizing the correlation matrix

corrplot::corrplot()

R corrplot function is used to plot the graph of the correlation matrix.

The simplified format of the function is :

library(corrplot)
## corrplot 0.84 loaded
# Pairwise correlations between the columns of data1
corr <- cor(x = data1)
# Draw the correlogram using circles
corrplot(corr = corr, method = "circle")

  • Positive correlations are displayed in blue and negative correlations in red color.
  • Color intensity and the size of the circle are proportional to the correlation coefficients

Visualization methods

Seven different visualization methods can be used : “circle”, “square”, “ellipse”, “number”, “shade”, “color”, “pie”.

# "pie" method
corrplot(corr = corr, method = "pie")

# "color" method
corrplot(corr = corr, method = "color")

# "number" method: display the correlation coefficient
corrplot(corr = corr, method = "number")

Types of correlogram layout

There are three types of layout :

  • “full” (default) : display full correlation matrix
  • “upper”: display upper triangular of the correlation matrix
  • “lower”: display lower triangular of the correlation matrix
# Upper triangle of the correlation matrix only
corrplot(corr = corr, type = "upper")

# Lower triangle of the correlation matrix only
corrplot(corr = corr, type = "lower")

Reordering the correlation matrix

The correlation matrix can be reordered according to the correlation coefficient. This is important to identify the hidden structure and pattern in the matrix. “hclust” for hierarchical clustering order is used in the following examples.

# Upper-triangle correlogram, variables reordered by hierarchical clustering
corrplot(corr = corr, type = "upper", order = "hclust")

Changing the color of the correlogram

The color of the correlogram can be customized. An RColorBrewer color palette is used in the R script below :

library(RColorBrewer)
# Same reordered upper-triangle plot, colored with the 8-color "PuOr"
# palette from RColorBrewer
corrplot(
    corr = corr,
    type = "upper",
    order = "hclust",
    col = brewer.pal(n = 8, name = "PuOr")
)

Changing the color and the rotation of the text labels

The color and the rotation of the text labels can be customized with the tl.col (text label color) and tl.srt (text label string rotation, in degrees) arguments, as in the R script below :

# Black text labels, rotated by 45
corrplot(corr = corr, type = "upper", tl.col = "black", tl.srt = 45)

# Convert the data frame to a tibble so that printing it shows only the
# first rows. as_tibble() replaces tbl_df(), which is deprecated in dplyr.
data <- as_tibble(data)
data
## # A tibble: 5,200 x 12
##    world_rank institution         country  national_rank quality_of_educa…
##         <int> <chr>               <chr>            <int>             <int>
##  1          1 Harvard University  USA                  1                 7
##  2          2 Massachusetts Inst… USA                  2                 9
##  3          3 Stanford University USA                  3                17
##  4          4 University of Camb… United …             1                10
##  5          5 California Institu… USA                  4                 2
##  6          6 Princeton Universi… USA                  5                 8
##  7          7 University of Oxfo… United …             2                13
##  8          8 Yale University     USA                  6                14
##  9          9 Columbia University USA                  7                23
## 10         10 University of Cali… USA                  8                16
## # ... with 5,190 more rows, and 7 more variables: alumni_employment <int>,
## #   quality_of_faculty <int>, publications <int>, influence <int>,
## #   citations <int>, score <dbl>, year <int>

dplyr가 제공하는 기능 중 특별한 건 아니고, 크기가 큰 데이터를 실수로 실행하게 되면, 모든 데이터가 console에 출력되면서 시간이 오래 걸리는데, 이를 방지하기 위해 데이터 일부만 보여주는 기능

Filter rows with filter()

Using comparison operators

  • 비교연산자 : >, >=, <, <=, != (not equal), and == (equal)
  • = 과 == 의 차이 주의!
# Bind the dplyr verbs explicitly so that same-named functions from other
# packages cannot take their place
select <- dplyr::select
filter <- dplyr::filter
# Rows whose world rank is 1
data %>% filter(world_rank == 1)
## # A tibble: 7 x 12
##   world_rank institution        country national_rank quality_of_education
##        <int> <chr>              <chr>           <int>                <int>
## 1          1 Harvard University USA                 1                    7
## 2          1 Harvard University USA                 1                    1
## 3          1 Harvard University USA                 1                    1
## 4          1 Harvard University USA                 1                    1
## 5          1 Harvard University USA                 1                    1
## 6          1 Harvard University USA                 1                    1
## 7          1 Harvard University USA                 1                    2
## # ... with 7 more variables: alumni_employment <int>,
## #   quality_of_faculty <int>, publications <int>, influence <int>,
## #   citations <int>, score <dbl>, year <int>
# South Korean universities with a world rank below 100
data %>% filter(country == "South Korea" & world_rank < 100)
## # A tibble: 8 x 12
##   world_rank institution        country   national_rank quality_of_educat…
##        <int> <chr>              <chr>             <int>              <int>
## 1         75 Seoul National Un… South Ko…             1                101
## 2         40 Seoul National Un… South Ko…             1                101
## 3         24 Seoul National Un… South Ko…             1                355
## 4         24 Seoul National Un… South Ko…             1                367
## 5         98 Yonsei University  South Ko…             2                367
## 6         24 Seoul National Un… South Ko…             1                378
## 7         23 Seoul National Un… South Ko…             1                383
## 8         60 Seoul National Un… South Ko…             1                 NA
## # ... with 7 more variables: alumni_employment <int>,
## #   quality_of_faculty <int>, publications <int>, influence <int>,
## #   citations <int>, score <dbl>, year <int>
# Total number of missing values in the whole table
data %>%
    is.na() %>%
    sum()
## [1] 1328
# Number of missing values per column. vapply() pins the result to one
# integer per column instead of letting sapply() guess the return type.
vapply(data, function(column) sum(is.na(column)), integer(1))
##           world_rank          institution              country 
##                    0                    0                    0 
##        national_rank quality_of_education    alumni_employment 
##                    0                  597                    0 
##   quality_of_faculty         publications            influence 
##                  731                    0                    0 
##            citations                score                 year 
##                    0                    0                    0
# NOTE(review): gdata is not used in the visible part of this file
gdata <- group_by(data, year)
# Drop the 2018 rows, then count the missing values left in each column
clean_data <- data %>% filter(year != 2018)
vapply(clean_data, function(column) sum(is.na(column)), integer(1))
##           world_rank          institution              country 
##                    0                    0                    0 
##        national_rank quality_of_education    alumni_employment 
##                    0                    0                    0 
##   quality_of_faculty         publications            influence 
##                    0                    0                    0 
##            citations                score                 year 
##                    0                    0                    0

Pick variables by their names (select()).

# Keep only the country column
select(data, country)
## # A tibble: 5,200 x 1
##    country       
##    <chr>         
##  1 USA           
##  2 USA           
##  3 USA           
##  4 United Kingdom
##  5 USA           
##  6 USA           
##  7 United Kingdom
##  8 USA           
##  9 USA           
## 10 USA           
## # ... with 5,190 more rows
# Keep every column from quality_of_education through citations
select(data, quality_of_education:citations)
## # A tibble: 5,200 x 6
##    quality_of_education alumni_employment quality_of_faculty publications
##                   <int>             <int>              <int>        <int>
##  1                    7                 9                  1            1
##  2                    9                17                  3           12
##  3                   17                11                  5            4
##  4                   10                24                  4           16
##  5                    2                29                  7           37
##  6                    8                14                  2           53
##  7                   13                28                  9           15
##  8                   14                31                 12           14
##  9                   23                21                 10           13
## 10                   16                52                  6            6
## # ... with 5,190 more rows, and 2 more variables: influence <int>,
## #   citations <int>

Top 5 대학 변동 추이

# For every year keep the five rows with the lowest world_rank and draw
# one line per institution across the years.
data %>%
    select(year, institution, world_rank) %>%
    group_by(year) %>%
    top_n(n = -5, wt = world_rank) %>%
    ggplot(aes(x = year, y = world_rank, group = institution,
               color = institution)) +
    geom_line() +
    geom_point(aes(shape = institution)) +
    labs(title = "World Ranks (2012-2018)",
         subtitle = "Best World ranked Universities by CWUR",
         x = "Year", y = "World Rank") +
    theme_bw()

# data %>%
#     select(world_rank, institution, year) %>%
#     filter(institution == "California Institute of Technology")

Interactive geo plot

# Per country and year: number of rows (universities), minimum, maximum
# and rounded mean world rank. summarise() already returns exactly these
# columns in this order, so no extra select() is needed.
ccwur <- data %>%
    group_by(country, year) %>%
    summarise(
        nr = n(),
        minw = min(world_rank),
        maxw = max(world_rank),
        avgw = round(mean(world_rank), 0)
    ) %>%
    ungroup()

# light grey boundaries
#l <- list(color = toRGB("grey"), width = 0.5)
# Build one text label per row from the summary columns; the pieces are
# separated by "<br>" markers and joined by paste() with its default
# single-space separator.
ccwur[["hover"]] <- paste(
    "Country: ", ccwur$country, "<br>",
    "Year: ", ccwur$year, "<br>",
    "Universities in top: ", ccwur$nr, "<br>",
    "Min rank in top: ", ccwur$minw, "<br>",
    "Max rank in top: ", ccwur$maxw, "<br>",
    "Mean rank in top: ", ccwur$avgw, "<br>"
)
# Map projection/options passed to layout(geo = ...).
# "orthographic" is the projection name plotly recognises; the previous
# value "orthogonal" is not one of plotly's geo projection types.
g <- list(
  showframe = TRUE,
  showcoastlines = TRUE,
  projection = list(type = "orthographic")
)

library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
# Geo map of ccwur: countries are matched by name, colored by nr, with one
# frame per year and the prepared hover text attached to each location.
plot_geo(ccwur, locationmode = "country names") %>%
  add_trace(
    locations = ~country,
    z = ~nr,
    color = ~nr,
    colors = "Spectral",
    frame = ~year,
    text = ~hover
  ) %>%
  colorbar(title = "Number of\nuniversities in top", tickprefix = "") %>%
  layout(title = "Number of universities in top", geo = g)